{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "title: Preprocessing\n", "description: Preprocessing the data for future use\n", "---" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from IPython.display import display, HTML" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing match results" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Column names : 'League', 'Country', 'Season', 'Date', 'Home', 'HomeGoals', 'Away', 'AwayGoals'\n", "match_results = pd.read_csv('data/extracted_match_results.csv', parse_dates=['Date'])\n", "\n", "# Fix encoding issue : renaming 'Fu\\303\\237ball-Bundesliga' to 'Bundesliga'\n", "match_results['League'] = match_results['League'].replace('Fuball-Bundesliga', 'Bundesliga')\n", "\n", "# Map country codes to country names\n", "country_map = {'ENG': 'England', 'ITA': 'Italy', 'FRA': 'France', 'GER': 'Germany', 'ESP': 'Spain', 'POR': 'Portugal', 'SCO': 'Scotland', 'POL': 'Poland', 'GRE': 'Greece', 'TUR': 'Turkey', 'SUI': 'Switzerland', 'NED': 'Netherlands', 'BEL': 'Belgium', 'AUT': 'Austria'}\n", "match_results['Country'] = match_results['Country'].map(country_map)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SeasonDateHomeGoalsAwayGoals
count16463.0000001646316362.00000016362.000000
mean2019.0004862019-01-10 07:48:02.7552693761.5364871.213299
min2015.0000002014-08-08 00:00:000.0000000.000000
25%2017.0000002016-10-29 00:00:001.0000000.000000
50%2019.0000002019-01-11 00:00:001.0000001.000000
75%2021.0000002021-03-21 00:00:002.0000002.000000
max2023.0000002023-06-11 00:00:0010.0000009.000000
std2.581816NaN1.3106441.166416
\n", "
" ], "text/plain": [ " Season Date HomeGoals AwayGoals\n", "count 16463.000000 16463 16362.000000 16362.000000\n", "mean 2019.000486 2019-01-10 07:48:02.755269376 1.536487 1.213299\n", "min 2015.000000 2014-08-08 00:00:00 0.000000 0.000000\n", "25% 2017.000000 2016-10-29 00:00:00 1.000000 0.000000\n", "50% 2019.000000 2019-01-11 00:00:00 1.000000 1.000000\n", "75% 2021.000000 2021-03-21 00:00:00 2.000000 2.000000\n", "max 2023.000000 2023-06-11 00:00:00 10.000000 9.000000\n", "std 2.581816 NaN 1.310644 1.166416" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "match_results.describe()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LeagueCountrySeasonDateHomeHomeGoalsAwayAwayGoals
0Premier LeagueEngland20152014-08-16Manchester Utd1.0Swansea City2.0
1Premier LeagueEngland20152014-08-16Stoke City0.0Aston Villa1.0
2Premier LeagueEngland20152014-08-16Leicester City2.0Everton2.0
3Premier LeagueEngland20152014-08-16QPR0.0Hull City1.0
4Premier LeagueEngland20152014-08-16West Ham0.0Tottenham1.0
\n", "
" ], "text/plain": [ " League Country Season Date Home HomeGoals \\\n", "0 Premier League England 2015 2014-08-16 Manchester Utd 1.0 \n", "1 Premier League England 2015 2014-08-16 Stoke City 0.0 \n", "2 Premier League England 2015 2014-08-16 Leicester City 2.0 \n", "3 Premier League England 2015 2014-08-16 QPR 0.0 \n", "4 Premier League England 2015 2014-08-16 West Ham 0.0 \n", "\n", " Away AwayGoals \n", "0 Swansea City 2.0 \n", "1 Aston Villa 1.0 \n", "2 Everton 2.0 \n", "3 Hull City 1.0 \n", "4 Tottenham 1.0 " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#| label: match_results\n", "match_results.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "def return_result(goal1, goal2):\n", " if goal1 > goal2:\n", " return 'win'\n", " elif goal1 < goal2:\n", " return 'loss'\n", " else:\n", " return 'draw'\n", "\n", "match_results['HomeResult'] = match_results.apply(return_result, axis=1, args=('HomeGoals', 'AwayGoals'))\n", "match_results['AwayResult'] = match_results.apply(return_result, axis=1, args=('AwayGoals', 'HomeGoals'))\n", "\n", "home_results = match_results[['League', 'Country', 'Date', 'Home', 'HomeGoals', 'HomeResult']]\n", "home_results = home_results.rename(columns={'Home': 'Team', 'HomeGoals': 'Goals', 'HomeResult': 'Result'})\n", "home_results['isHome'] = True\n", "\n", "away_results = match_results[['League', 'Country', 'Away', 'Date', 'AwayGoals', 'AwayResult']]\n", "away_results = away_results.rename(columns={'Away': 'Team', 'AwayGoals': 'Goals', 'AwayResult': 'Result'})\n", "away_results['isHome'] = False\n", "\n", "match_results = pd.concat([home_results, away_results], ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing head coach" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamLeagueCountryHeadCoachAppointedEndDateTenureMatchesWinsDrawsLosses
0Chelsea FCPremier LeagueEnglandMauricio Pochettino2023-07-01NaT2964422913
1Chelsea FCPremier LeagueEnglandGraham Potter2022-09-082023-04-022063112811
2Chelsea FCPremier LeagueEnglandThomas Tuchel2021-01-262022-09-07589100631918
3Chelsea FCPremier LeagueEnglandFrank Lampard2019-07-042021-01-2557184441525
4Chelsea FCPremier LeagueEnglandMaurizio Sarri2018-07-142019-06-3035163401112
\n", "
" ], "text/plain": [ " Team League Country HeadCoach Appointed \\\n", "0 Chelsea FC Premier League England Mauricio Pochettino 2023-07-01 \n", "1 Chelsea FC Premier League England Graham Potter 2022-09-08 \n", "2 Chelsea FC Premier League England Thomas Tuchel 2021-01-26 \n", "3 Chelsea FC Premier League England Frank Lampard 2019-07-04 \n", "4 Chelsea FC Premier League England Maurizio Sarri 2018-07-14 \n", "\n", " EndDate Tenure Matches Wins Draws Losses \n", "0 NaT 296 44 22 9 13 \n", "1 2023-04-02 206 31 12 8 11 \n", "2 2022-09-07 589 100 63 19 18 \n", "3 2021-01-25 571 84 44 15 25 \n", "4 2019-06-30 351 63 40 11 12 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#| label: head_coach\n", "\n", "# Column names : 'Team', 'League', 'Country', 'HeadCoach', 'Appointed', 'EndDate', 'Tenure', 'Matches', 'Wins', 'Draws', 'Losses'\n", "head_coach = pd.read_csv('data/extracted_head_coach.csv', parse_dates=['Appointed', 'EndDate'])\n", "head_coach.head()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AppointedEndDateTenureMatchesWinsDrawsLosses
count353234683532.0000003532.0000003532.0000003532.0000003532.000000
mean1982-05-15 03:53:36.7610419201983-04-16 22:00:00608.21234451.58550422.52633113.00622916.052945
min1886-06-26 00:00:001893-08-01 00:00:00-242.0000000.0000000.0000000.0000000.000000
25%1961-11-02 06:00:001963-06-30 00:00:00186.00000010.0000002.0000002.0000004.000000
50%1987-07-01 00:00:001988-03-06 00:00:00364.00000029.00000010.0000007.00000010.000000
75%2004-12-29 00:00:002005-06-30 00:00:00730.00000067.00000028.00000017.00000021.000000
max2024-04-23 00:00:002024-06-30 00:00:0014613.0000001490.000000895.000000323.000000272.000000
stdNaNNaN815.28398275.76152339.42259718.70784720.552184
\n", "
" ], "text/plain": [ " Appointed EndDate Tenure \\\n", "count 3532 3468 3532.000000 \n", "mean 1982-05-15 03:53:36.761041920 1983-04-16 22:00:00 608.212344 \n", "min 1886-06-26 00:00:00 1893-08-01 00:00:00 -242.000000 \n", "25% 1961-11-02 06:00:00 1963-06-30 00:00:00 186.000000 \n", "50% 1987-07-01 00:00:00 1988-03-06 00:00:00 364.000000 \n", "75% 2004-12-29 00:00:00 2005-06-30 00:00:00 730.000000 \n", "max 2024-04-23 00:00:00 2024-06-30 00:00:00 14613.000000 \n", "std NaN NaN 815.283982 \n", "\n", " Matches Wins Draws Losses \n", "count 3532.000000 3532.000000 3532.000000 3532.000000 \n", "mean 51.585504 22.526331 13.006229 16.052945 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 10.000000 2.000000 2.000000 4.000000 \n", "50% 29.000000 10.000000 7.000000 10.000000 \n", "75% 67.000000 28.000000 17.000000 21.000000 \n", "max 1490.000000 895.000000 323.000000 272.000000 \n", "std 75.761523 39.422597 18.707847 20.552184 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_coach.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We need to filter head coach that were not active between 2015 to 2023." ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "last_match = match_results['Date'].max()\n", "first_match = match_results['Date'].min()\n", "\n", "# Remove head coach that were appointed after 2023 season\n", "head_coach = head_coach[head_coach['Appointed'] <= last_match]\n", "# Keep head coach that were dimissed after 2015 or that are still active\n", "head_coach = head_coach[(head_coach['EndDate'] >= first_match) | (head_coach['EndDate'].isna())]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ ":::{caution}\n", "Head coach appointments records, extracted from TransferMarkt, contains data related to head coach in that specific club : tenure, number of matches played, number of matches won, etc. Those datapoint goes beyond our cut-off date of 2023 end season (2024-01-14).\n", "\n", "One important thing is that those feature beyond cut-off date still relate to a head coach appointment we have in our records. This guarantees us that metrics such *number of club head coach managed* are properly reflected and still relate to head coach performance.\n", "\n", "However, this create an asymetry in our data, as certain data point are limited by a time-frame and others not.\n", "Moreover, we must be careful in how we compare these datapoint to others dataset such as match results as it could easily bias our statistical study.\n", "\n", "The only way we combine this dataset to match result is by extracting head coach tenure on day of a match. This does not bias our statistical study as it is a feature that is properly reflected by our cut-off date.\n", ":::\n", "\n", ":::{note}\n", "I have considered computing Head Coach performance metric from match results but we would lose information on prior records as well as creating imbalance data for plot such as linear regression of head coach performance over head coach tenure : a long standing coach which would not been dismissed soon after our start date would have a lower number of matches, thus a performance metric with higher variance that would bias linear regression due to long tenure.\n", ":::" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# Ensuring there is only 1 head coach at a time in any given team.\n", "\n", "head_coach_bis = head_coach.copy()\n", "# Sort data by 'Team' and 'Appointed'\n", "head_coach_bis = head_coach_bis.sort_values(['Team', 'Appointed'])\n", "# Fillna with end date of 2022-2023 season\n", "head_coach_bis['EndDate'] = head_coach_bis['EndDate'].fillna('2024-01-14')\n", "# Check if the next appointment is overlapping with the current one\n", "head_coach_bis['OverlapDuration'] = head_coach_bis.groupby('Team')['Appointed'].shift(-1) - head_coach_bis['EndDate']\n", "head_coach_bis['Overlap'] = head_coach_bis['OverlapDuration'].dt.days < 0\n", "\n", "# Show team with overlapping appointments\n", "overlapping = head_coach_bis[head_coach_bis['Overlap']]" ] }, { "cell_type": "markdown", "metadata": { "label": "overlapping_coach", "user_expressions": [ { "expression": "overlapping.shape[0]", "result": { "data": { "text/plain": "11" }, "metadata": {}, "status": "ok" } }, { "expression": "', '.join(overlapping['Team'].unique())", "result": { "data": { "text/plain": "'FC Empoli, Hellas Verona, LOSC Lille, Newcastle United, Rayo Vallecano, SSC Napoli, Stade Reims, Stade Rennais FC, Torino FC, West Ham United'" }, "metadata": {}, "status": "ok" } } ] }, "source": [ "Nous avons trouvé {eval}`overlapping.shape[0]` enregistrements de mandats d'entraîneurs sportifs qui avaient lieu alors qu'un autre chef-entraîneur assurait l'entraînement de l'équipe. Ces enregistrements concernent les équipes suivante : {eval}`', '.join(overlapping['Team'].unique())`. Ces enregistrements sont exclus du jeu de données." ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamLeagueCountryHeadCoachAppointedEndDateTenureMatchesWinsDrawsLossesOverlapDurationOverlap
3422Stade ReimsLigue 1FranceDavid Guion2017-05-222021-05-251464157674644-1059 daysTrue
3259Stade Rennais FCLigue 1FranceRolland Courbis2016-01-202016-09-1123517638-72 daysTrue
3343LOSC LilleLigue 1FranceRené Girard2013-07-012015-06-3072995422429-35 daysTrue
2003Torino FCSerie AItalyGian Piero Ventura2011-06-062016-06-251846217856468-31 daysTrue
192West Ham UnitedPremier LeagueEnglandSam Allardyce2011-07-012015-06-301460181694468-21 daysTrue
\n", "
" ], "text/plain": [ " Team League Country HeadCoach \\\n", "3422 Stade Reims Ligue 1 France David Guion \n", "3259 Stade Rennais FC Ligue 1 France Rolland Courbis \n", "3343 LOSC Lille Ligue 1 France René Girard \n", "2003 Torino FC Serie A Italy Gian Piero Ventura \n", "192 West Ham United Premier League England Sam Allardyce \n", "\n", " Appointed EndDate Tenure Matches Wins Draws Losses \\\n", "3422 2017-05-22 2021-05-25 1464 157 67 46 44 \n", "3259 2016-01-20 2016-09-11 235 17 6 3 8 \n", "3343 2013-07-01 2015-06-30 729 95 42 24 29 \n", "2003 2011-06-06 2016-06-25 1846 217 85 64 68 \n", "192 2011-07-01 2015-06-30 1460 181 69 44 68 \n", "\n", " OverlapDuration Overlap \n", "3422 -1059 days True \n", "3259 -72 days True \n", "3343 -35 days True \n", "2003 -31 days True \n", "192 -21 days True " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_coach_bis.sort_values('OverlapDuration').head()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamAppointedEndDateOverlapOverlapDuration
2248FC Empoli2012-06-252015-06-04False11 days
2247FC Empoli2015-06-152016-06-30False1 days
2246FC Empoli2016-07-012017-06-30False1 days
2245FC Empoli2017-07-012017-12-17False0 days
2244FC Empoli2017-12-172018-11-05False1 days
2243FC Empoli2018-11-062019-03-13False0 days
2242FC Empoli2019-03-132019-06-30True-12 days
2241FC Empoli2019-06-182019-11-12False2 days
2240FC Empoli2019-11-142020-01-26False0 days
2239FC Empoli2020-01-262020-08-11False8 days
\n", "
" ], "text/plain": [ " Team Appointed EndDate Overlap OverlapDuration\n", "2248 FC Empoli 2012-06-25 2015-06-04 False 11 days\n", "2247 FC Empoli 2015-06-15 2016-06-30 False 1 days\n", "2246 FC Empoli 2016-07-01 2017-06-30 False 1 days\n", "2245 FC Empoli 2017-07-01 2017-12-17 False 0 days\n", "2244 FC Empoli 2017-12-17 2018-11-05 False 1 days\n", "2243 FC Empoli 2018-11-06 2019-03-13 False 0 days\n", "2242 FC Empoli 2019-03-13 2019-06-30 True -12 days\n", "2241 FC Empoli 2019-06-18 2019-11-12 False 2 days\n", "2240 FC Empoli 2019-11-14 2020-01-26 False 0 days\n", "2239 FC Empoli 2020-01-26 2020-08-11 False 8 days" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_coach_bis[head_coach_bis['Team'].isin(overlapping['Team'])][['Team', 'Appointed', 'EndDate', 'Overlap', 'OverlapDuration']].sort_values(['Team', 'Appointed']).head(10)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamAppointedEndDateOverlap
3423Stade Reims2016-07-012017-05-22False
3422Stade Reims2017-05-222021-05-25True
3421Stade Reims2018-07-012019-03-30False
3420Stade Reims2021-06-232022-10-13False
\n", "
" ], "text/plain": [ " Team Appointed EndDate Overlap\n", "3423 Stade Reims 2016-07-01 2017-05-22 False\n", "3422 Stade Reims 2017-05-22 2021-05-25 True\n", "3421 Stade Reims 2018-07-01 2019-03-30 False\n", "3420 Stade Reims 2021-06-23 2022-10-13 False" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#| label: hc_inconsistency\n", "\n", "head_coach_bis[head_coach_bis['Team'] == 'Stade Reims'][['Team', 'Appointed', 'EndDate', 'Overlap']].sort_values(['Appointed']).iloc[2:6]" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# Check records that are overlapping\n", "head_coach[~head_coach.index.isin(head_coach_bis[head_coach_bis['OverlapDuration'].dt.days <= -20].index)]\n", "# Drop overlapping records with a duration of more than 20 days\n", "head_coach = head_coach[head_coach.index.isin(head_coach_bis[head_coach_bis['OverlapDuration'].dt.days > -20].index)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next we would like to add to each head coach record the number of appointment he is completing." ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamLeagueCountryHeadCoachAppointedEndDateTenureMatchesWinsDrawsLossesAppointmentNumber
2244FC EmpoliSerie AItalyAurelio Andreazzoli2017-12-172018-11-0532335171081
2242FC EmpoliSerie AItalyAurelio Andreazzoli2019-03-132019-06-30109115152
1773Genoa CFCSerie AItalyAurelio Andreazzoli2019-07-012019-10-2211392253
2237FC EmpoliSerie AItalyAurelio Andreazzoli2021-06-212022-06-01345411211184
417Real MadridLaLigaSpainCarlo Ancelotti2013-07-012015-05-256931198914161
\n", "
" ], "text/plain": [ " Team League Country HeadCoach Appointed EndDate \\\n", "2244 FC Empoli Serie A Italy Aurelio Andreazzoli 2017-12-17 2018-11-05 \n", "2242 FC Empoli Serie A Italy Aurelio Andreazzoli 2019-03-13 2019-06-30 \n", "1773 Genoa CFC Serie A Italy Aurelio Andreazzoli 2019-07-01 2019-10-22 \n", "2237 FC Empoli Serie A Italy Aurelio Andreazzoli 2021-06-21 2022-06-01 \n", "417 Real Madrid LaLiga Spain Carlo Ancelotti 2013-07-01 2015-05-25 \n", "\n", " Tenure Matches Wins Draws Losses AppointmentNumber \n", "2244 323 35 17 10 8 1 \n", "2242 109 11 5 1 5 2 \n", "1773 113 9 2 2 5 3 \n", "2237 345 41 12 11 18 4 \n", "417 693 119 89 14 16 1 " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Using cronological information about appointment\n", "# Add a column to head coach records that tells us about how many appointment head coach has done\n", "\n", "head_coach = head_coach.sort_values(['HeadCoach', 'Appointed'])\n", "head_coach['AppointmentNumber'] = head_coach.groupby('HeadCoach').cumcount() + 1\n", "head_coach[head_coach['HeadCoach'].isin(head_coach[head_coach['AppointmentNumber'] > 3]['HeadCoach'])].sort_values('HeadCoach').head()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check if total_matches = wins + draws + losses\n", "head_coach[head_coach['Matches'] != head_coach['Wins'] + head_coach['Draws'] + head_coach['Losses']].shape[0]" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamLeagueCountryHeadCoachAppointedEndDateTenureMatchesWinsDrawsLossesAppointmentNumber
1696SS LazioSerie AItalyMarcelo Bielsa2016-07-062016-07-08200002
3421Stade ReimsLigue 1FranceSébastien Desmazeau2018-07-012019-03-3027200001
2698TSG 1899 HoffenheimBundesligaGermanyMarcel Rapp2020-06-252020-06-30511001
2332Hellas VeronaSerie AItalyEusebio Di Francesco2021-06-072021-09-149941033
1921Udinese CalcioSerie AItalyIgor Tudor2018-04-242018-06-306742111
2761VfB StuttgartBundesligaGermanyNico Willig2019-05-062019-06-305541301
2303Chievo VeronaSerie AItalyGian Piero Ventura2018-10-102018-11-133440131
934Granada CFLaLigaSpainMiguel Ángel Portugal2018-05-012018-06-306052031
2765VfB StuttgartBundesligaGermanyJos Luhukay2016-07-012016-09-157653021
3390Montpellier HSCLigue 1FrancePascal Baills2015-12-272016-01-263051041
245Crystal PalacePremier LeagueEnglandFrank de Boer2017-07-012017-09-117251042
935Granada CFLaLigaSpainPedro Morilla2018-03-192018-04-304261321
1778Genoa CFCSerie AItalyAndrea Mandorlini2017-02-202017-04-104961142
939Granada CFLaLigaSpainPaco Jémez2016-07-012016-09-288960242
2329Hellas VeronaSerie AItalySalvatore Bocchetti2022-10-132022-12-025060061
1201UD Las PalmasLaLigaSpainManolo Márquez2017-07-032017-09-268562041
1560AC MilanSerie AItalyMarco Giampaolo2019-06-192019-10-0811173042
2334Hellas VeronaSerie AItalyAlfredo Aglietti2019-05-022019-06-305974121
937Granada CFLaLigaSpainTony Adams2017-04-102017-06-308170071
1775Genoa CFCSerie AItalyIvan Juric2018-10-092018-12-065880353
\n", "
" ], "text/plain": [ " Team League Country HeadCoach \\\n", "1696 SS Lazio Serie A Italy Marcelo Bielsa \n", "3421 Stade Reims Ligue 1 France Sébastien Desmazeau \n", "2698 TSG 1899 Hoffenheim Bundesliga Germany Marcel Rapp \n", "2332 Hellas Verona Serie A Italy Eusebio Di Francesco \n", "1921 Udinese Calcio Serie A Italy Igor Tudor \n", "2761 VfB Stuttgart Bundesliga Germany Nico Willig \n", "2303 Chievo Verona Serie A Italy Gian Piero Ventura \n", "934 Granada CF LaLiga Spain Miguel Ángel Portugal \n", "2765 VfB Stuttgart Bundesliga Germany Jos Luhukay \n", "3390 Montpellier HSC Ligue 1 France Pascal Baills \n", "245 Crystal Palace Premier League England Frank de Boer \n", "935 Granada CF LaLiga Spain Pedro Morilla \n", "1778 Genoa CFC Serie A Italy Andrea Mandorlini \n", "939 Granada CF LaLiga Spain Paco Jémez \n", "2329 Hellas Verona Serie A Italy Salvatore Bocchetti \n", "1201 UD Las Palmas LaLiga Spain Manolo Márquez \n", "1560 AC Milan Serie A Italy Marco Giampaolo \n", "2334 Hellas Verona Serie A Italy Alfredo Aglietti \n", "937 Granada CF LaLiga Spain Tony Adams \n", "1775 Genoa CFC Serie A Italy Ivan Juric \n", "\n", " Appointed EndDate Tenure Matches Wins Draws Losses \\\n", "1696 2016-07-06 2016-07-08 2 0 0 0 0 \n", "3421 2018-07-01 2019-03-30 272 0 0 0 0 \n", "2698 2020-06-25 2020-06-30 5 1 1 0 0 \n", "2332 2021-06-07 2021-09-14 99 4 1 0 3 \n", "1921 2018-04-24 2018-06-30 67 4 2 1 1 \n", "2761 2019-05-06 2019-06-30 55 4 1 3 0 \n", "2303 2018-10-10 2018-11-13 34 4 0 1 3 \n", "934 2018-05-01 2018-06-30 60 5 2 0 3 \n", "2765 2016-07-01 2016-09-15 76 5 3 0 2 \n", "3390 2015-12-27 2016-01-26 30 5 1 0 4 \n", "245 2017-07-01 2017-09-11 72 5 1 0 4 \n", "935 2018-03-19 2018-04-30 42 6 1 3 2 \n", "1778 2017-02-20 2017-04-10 49 6 1 1 4 \n", "939 2016-07-01 2016-09-28 89 6 0 2 4 \n", "2329 2022-10-13 2022-12-02 50 6 0 0 6 \n", "1201 2017-07-03 2017-09-26 85 6 2 0 4 \n", "1560 2019-06-19 2019-10-08 111 7 3 0 4 \n", "2334 2019-05-02 2019-06-30 59 7 4 1 2 \n", "937 2017-04-10 2017-06-30 81 7 0 0 7 \n", "1775 2018-10-09 2018-12-06 58 8 0 3 5 \n", "\n", " AppointmentNumber \n", "1696 2 \n", "3421 1 \n", "2698 1 \n", "2332 3 \n", "1921 1 \n", "2761 1 \n", "2303 1 \n", "934 1 \n", "2765 1 \n", "3390 1 \n", "245 2 \n", "935 1 \n", "1778 2 \n", "939 2 \n", "2329 1 \n", "1201 1 \n", "1560 2 \n", "2334 1 \n", "937 1 \n", "1775 3 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Display Head Coach with lowest number of match\n", "display(head_coach.sort_values('Matches').head(20))\n", "# Remove head coach records with less than 5 matches\n", "# We consider them to be not relevant as it a short stay of a head coach indicate either a temporary replacement or a very bad performance\n", "head_coach = head_coach[head_coach['Matches'] > 5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Joining head coach with match results" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(59, 153)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Compute number of team that are in head_coach but not in match_results\n", "coach_teams = set(head_coach['Team'])\n", "match_teams = set(match_results['Team'])\n", "\n", "coach_team_not_in_match = coach_teams - match_teams\n", "match_team_not_in_coach = match_teams - coach_teams\n", "\n", "len(coach_team_not_in_match), len(match_team_not_in_coach)" ] }, { "cell_type": "markdown", "metadata": { "label": "inconsistent_team_names", "user_expressions": [ { "expression": "len(match_teams)", "result": { "data": { "text/plain": "161" }, "metadata": {}, "status": "ok" } }, { "expression": "len(coach_teams)", "result": { "data": { "text/plain": "67" }, "metadata": {}, "status": "ok" } }, { "expression": "len(coach_team_not_in_match)", "result": { "data": { "text/plain": "59" }, "metadata": {}, "status": "ok" } } ] }, "source": [ "Les résultats de matchs contiennent {eval}`len(match_teams)` équipes et les mandats de coachs contiennent {eval}`len(coach_teams)` équipes. Cependant, le nom de certaines équipes est différent entre les deux jeux de données. Par exemple, « Liverpool » dans les résultats de match devient « Liverpool FC » dans les mandats des entraîneurs.\n", "\n", "Il existe {eval}`len(coach_team_not_in_match)` équipes présentes dans les mandats de coachs qui n'ont pas de correspondance dans les résultats de match." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(Country\n", " England [Newcastle United, Crystal Palace, Chelsea FC,...\n", " France [OGC Nice, FC Toulouse, Olympique Marseille, F...\n", " Germany [1.FC Köln, 1.FSV Mainz 05, Eintracht Frankfur...\n", " Italy [Frosinone Calcio, FC Empoli, Genoa CFC, Hella...\n", " Spain [Granada CF, Valencia CF, Celta de Vigo, Real ...\n", " Name: Team, dtype: object,\n", " Country\n", " England [Manchester Utd, Stoke City, Leicester City, Q...\n", " France [Reims, Nice, Guingamp, Nantes, Evian, Lille, ...\n", " Germany [Bayern Munich, Hoffenheim, Hannover 96, Köln,...\n", " Italy [Chievo, Roma, Atalanta, Milan, Genoa, Palermo...\n", " Spain [Málaga, Granada, Sevilla, Almería, Eibar, Cel...\n", " Name: Team, dtype: object)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Group teams by country\n", "coach_teams_by_country = head_coach.groupby('Country')['Team'].unique()\n", "match_teams_by_country = match_results.groupby('Country')['Team'].unique()\n", "\n", "coach_teams_by_country, match_teams_by_country" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "from thefuzz import process\n", "\n", "team_name_mapping = {}\n", "\n", "# For each country\n", "for country in coach_teams_by_country.index:\n", " # Get teams for this country\n", " coach_teams = coach_teams_by_country[country]\n", " match_teams = match_teams_by_country.get(country, [])\n", "\n", " # For each team in coach_teams\n", " for coach_team in coach_teams:\n", " # Find the best match in match_teams\n", " matching_scores = process.extract(coach_team, match_teams, limit=1)\n", "\n", " if len(matching_scores) != 0 and matching_scores[0][1] >= 60:\n", " team_name_mapping[coach_team] = matching_scores[0][0]\n", " else:\n", " team_name_mapping[coach_team] = None\n", " print(f\"No match found for {coach_team} among {match_teams} in {country}\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Team in head coach recordsTeam in match results
Newcastle UnitedNewcastle Utd
Crystal PalaceCrystal Palace
Chelsea FCChelsea
Arsenal FCArsenal
Liverpool FCLiverpool
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#| label: team_match_table\n", "\n", "name_match = pd.DataFrame(team_name_mapping.items(), columns=['Team in head coach records', 'Team in match results'])\n", "display(HTML(name_match.head().to_html(index=False)))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TeamLeagueCountryHeadCoachAppointedEndDateTenureMatchesWinsDrawsLossesAppointmentNumber
942GranadaLaLigaSpainAbel Resino2015-01-192015-05-01102152581
2935KölnBundesligaGermanyAchim Beierlorzer2019-07-012019-11-09131133191
2716Mainz 05BundesligaGermanyAchim Beierlorzer2019-11-182020-09-283152694132
2819Eint FrankfurtBundesligaGermanyAdi Hütter2018-07-012021-06-3010951416731431
2662M'GladbachBundesligaGermanyAdi Hütter2021-07-012022-06-3036437149142
\n", "
" ], "text/plain": [ " Team League Country HeadCoach Appointed \\\n", "942 Granada LaLiga Spain Abel Resino 2015-01-19 \n", "2935 Köln Bundesliga Germany Achim Beierlorzer 2019-07-01 \n", "2716 Mainz 05 Bundesliga Germany Achim Beierlorzer 2019-11-18 \n", "2819 Eint Frankfurt Bundesliga Germany Adi Hütter 2018-07-01 \n", "2662 M'Gladbach Bundesliga Germany Adi Hütter 2021-07-01 \n", "\n", " EndDate Tenure Matches Wins Draws Losses AppointmentNumber \n", "942 2015-05-01 102 15 2 5 8 1 \n", "2935 2019-11-09 131 13 3 1 9 1 \n", "2716 2020-09-28 315 26 9 4 13 2 \n", "2819 2021-06-30 1095 141 67 31 43 1 \n", "2662 2022-06-30 364 37 14 9 14 2 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Map head_coach['team'] with name_match\n", "head_coach['Team'] = head_coach['Team'].map(team_name_mapping)\n", "head_coach.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Nous pouvons maintenant ajouter le nombres de jours au poste d’entraîneur avec les résultats de match." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LeagueCountryDateTeamGoalsResultisHome
8955Ligue 1France2020-03-18StrasbourgNaNdrawTrue
8956Ligue 1France2020-03-13LyonNaNdrawTrue
8957Ligue 1France2020-03-14MontpellierNaNdrawTrue
8958Ligue 1France2020-03-14NantesNaNdrawTrue
8959Ligue 1France2020-03-14StrasbourgNaNdrawTrue
........................
25524Ligue 1France2020-05-23NiceNaNdrawFalse
25525Ligue 1France2020-05-23BrestNaNdrawFalse
25526Ligue 1France2020-05-23Paris S-GNaNdrawFalse
25527Ligue 1France2020-05-23LilleNaNdrawFalse
25528Ligue 1France2020-05-23DijonNaNdrawFalse
\n", "

202 rows × 7 columns

\n", "
" ], "text/plain": [ " League Country Date Team Goals Result isHome\n", "8955 Ligue 1 France 2020-03-18 Strasbourg NaN draw True\n", "8956 Ligue 1 France 2020-03-13 Lyon NaN draw True\n", "8957 Ligue 1 France 2020-03-14 Montpellier NaN draw True\n", "8958 Ligue 1 France 2020-03-14 Nantes NaN draw True\n", "8959 Ligue 1 France 2020-03-14 Strasbourg NaN draw True\n", "... ... ... ... ... ... ... ...\n", "25524 Ligue 1 France 2020-05-23 Nice NaN draw False\n", "25525 Ligue 1 France 2020-05-23 Brest NaN draw False\n", "25526 Ligue 1 France 2020-05-23 Paris S-G NaN draw False\n", "25527 Ligue 1 France 2020-05-23 Lille NaN draw False\n", "25528 Ligue 1 France 2020-05-23 Dijon NaN draw False\n", "\n", "[202 rows x 7 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Check match with NaN goals\n", "display(match_results[match_results['Goals'].isna()])\n", "# Remove match with NaN goals\n", "match_results = match_results[~match_results['Goals'].isna()]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LeagueCountryDateTeamGoalsResultisHomeHeadCoachDaysInPost
0Premier LeagueEngland2014-08-16Manchester Utd1.0lossTrueLouis van Gaal33.0
1Premier LeagueEngland2014-08-16Arsenal2.0winTrueArsène Wenger6528.0
2Premier LeagueEngland2014-08-17Liverpool2.0winTrueBrendan Rodgers777.0
3Premier LeagueEngland2014-08-17Newcastle Utd0.0lossTrueAlan Pardew1347.0
4Premier LeagueEngland2014-11-01Newcastle Utd1.0winTrueAlan Pardew1423.0
\n", "
" ], "text/plain": [ " League Country Date Team Goals Result isHome \\\n", "0 Premier League England 2014-08-16 Manchester Utd 1.0 loss True \n", "1 Premier League England 2014-08-16 Arsenal 2.0 win True \n", "2 Premier League England 2014-08-17 Liverpool 2.0 win True \n", "3 Premier League England 2014-08-17 Newcastle Utd 0.0 loss True \n", "4 Premier League England 2014-11-01 Newcastle Utd 1.0 win True \n", "\n", " HeadCoach DaysInPost \n", "0 Louis van Gaal 33.0 \n", "1 Arsène Wenger 6528.0 \n", "2 Brendan Rodgers 777.0 \n", "3 Alan Pardew 1347.0 \n", "4 Alan Pardew 1423.0 " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#| label: final_match_results\n", "\n", "# Merge head_coach with match_results\n", "match = match_results.merge(head_coach[['Team', 'HeadCoach', 'Appointed', 'EndDate']], on=['Team'], how='left')\n", "# Put aside team that don't have a head coach\n", "no_headcoach = match[match['HeadCoach'].isna()]\n", "match = match[~match['HeadCoach'].isna()]\n", "# Filter match_results_bis to keep only head coach that were appointed before the match and with no end date or end date after the match\n", "match = match[\n", " (match['Date'] >= match['Appointed']) &\n", " ((match['Date'] <= match['EndDate']) | match['EndDate'].isna())]\n", "# Join back the team that don't have a head coach\n", "match = pd.concat([match, no_headcoach], ignore_index=True)\n", "# Compute daysInPost\n", "match['DaysInPost'] = (match['Date'] - match['Appointed']).dt.days\n", "match = match.drop(columns=['Appointed', 'EndDate'])\n", "match.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Missing head coach data \n", "\n", "Il existe des matchs sur les lesquels nous n'avons pas d'information sur le coach en poste." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "95\n", "['Ajaccio', 'Alavés', 'Almería', 'Amiens', 'Angers', 'Arminia', 'Atlético Madrid', 'Auxerre', 'Benevento', 'Bochum', 'Bordeaux', 'Braunschweig', 'Brentford', 'Brescia', 'Brest', 'Brighton', 'Burnley', 'Caen', 'Cagliari', 'Cardiff City', 'Carpi', 'Cesena', 'Clermont Foot', 'Cremonese', 'Crotone', 'Cádiz', 'Córdoba', 'Dijon', 'Düsseldorf', 'Eibar', 'Elche', 'Espanyol', 'Evian', 'Freiburg', 'Fulham', 'Gazélec Ajaccio', 'Girona', 'Greuther Fürth', 'Guingamp', 'Hamburger SV', 'Hannover 96', 'Heidenheim', 'Hertha BSC', 'Holstein Kiel', 'Huddersfield', 'Huesca', 'Hull City', 'Ingolstadt 04', 'Inter', 'Karlsruher', 'La Coruña', 'Lecce', 'Leeds United', 'Leganés', 'Leicester City', 'Lens', 'Levante', 'Mallorca', 'Metz', 'Middlesbrough', 'Monza', 'Málaga', 'Nancy', 'Norwich City', \"Nott'ham Forest\", 'Nîmes', 'Nürnberg', 'Osasuna', 'Paderborn 07', 'Palermo', 'Parma', 'Pescara', 'QPR', 'RB Leipzig', 'Rennes', 'SPAL', 'Saint-Étienne', 'Salernitana', 'Sampdoria', 'Schalke 04', 'Sheffield Utd', 'Southampton', 'Spezia', 'Sporting Gijón', 'Stoke City', 'Strasbourg', 'Sunderland', 'Swansea City', 'Troyes', 'Union Berlin', 'Valladolid', 'Venezia', 'Watford', 'West Brom', 'Wolves']\n" ] } ], "source": [ "# Values count of coach record / no coach record per team\n", "team_hc_count = match.groupby('Team')['HeadCoach'].count()\n", "# Display list of team with no head coach record\n", "team_no_hc_record = team_hc_count[team_hc_count == 0].index.tolist()\n", "print(len(team_no_hc_record))\n", "print(team_no_hc_record)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Saving preprocessed data" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Save match_results\n", "match.to_csv('data/match_results.csv', index=False)\n", "head_coach.to_csv('data/head_coach.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python (TER)", "language": "python", "name": "ter" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 4 }